Fijamos la semilla para poder reproducir los experimentos. También se fija el numero de CPU’s a utilizar.
set.seed(42)
options(mc.cores = 24)Se importan las librerÃas a utilizar a lo largo de la notebook:
# install.packages(pacman)
# install.packages("https://cran.r-project.org/src/contrib/rstan_2.21.2.tar.gz",repos = NULL,type="source")
# sudo apt-get install libglpk-devlibrary(pacman)
p_load(tidyverse, tidymodels, rsample, rstan, shinystan, rstanarm, devtools)
p_load_gh('adrianmarino/commons')
import('../src/dataset.R')## [1] "-> '../src/dataset.R' script loadded successfuly!"
import('../src/plot.R')## [1] "-> '../src/plot.R' script loadded successfuly!"
import('../src/model.R')## [1] "-> './bayesian_regression_predictor.R' script loadded successfuly!"
## [1] "-> '../src/model.R' script loadded successfuly!"
Palmer Penguins
dataset <- load_dataset() %>% mutate_if(is.character, as.factor)
dataset %>% glimpse()## Rows: 344
## Columns: 8
## $ species <fct> Adelie, Adelie, Adelie, Adelie, Adelie, Adelie, Adel…
## $ island <fct> Torgersen, Torgersen, Torgersen, Torgersen, Torgerse…
## $ bill_length_mm <dbl> 39.1, 39.5, 40.3, NA, 36.7, 39.3, 38.9, 39.2, 34.1, …
## $ bill_depth_mm <dbl> 18.7, 17.4, 18.0, NA, 19.3, 20.6, 17.8, 19.6, 18.1, …
## $ flipper_length_mm <dbl> 181, 186, 195, NA, 193, 190, 181, 195, 193, 190, 186…
## $ body_mass_g <dbl> 3750, 3800, 3250, NA, 3450, 3650, 3625, 4675, 3475, …
## $ sex <fct> male, female, female, NA, female, male, female, male…
## $ year <dbl> 2007, 2007, 2007, 2007, 2007, 2007, 2007, 2007, 2007…
Se enumeran y describen breve-mente cada variable que forma parte del dataset:
Variables Numéricas:
Variables Categóricas:
A continuación veamos los posibles valores de las variables categóricas:
show_values(dataset %>% select_if(negate(is.numeric)))## _____________
## species n
## =============
## Adelie 152
## Chinstrap 68
## Gentoo 124
## ¯¯¯¯¯¯¯¯¯¯¯¯¯
## _____________
## island n
## =============
## Biscoe 168
## Dream 124
## Torgersen 52
## ¯¯¯¯¯¯¯¯¯¯¯¯¯
## __________
## sex n
## ==========
## female 165
## male 168
## NA 11
## ¯¯¯¯¯¯¯¯¯¯
missings_summary(dataset)hist_plots(dataset)Observaciones
box_plots(dataset)Observaciones
** COMPLETAR
No se registran valores mas extremos que el mÃnimo y máximo valor en cada variables. Es decir que no encontramos outliers.
outliers(dataset, column='bill_length_mm')## $inf
## [1] 32.1
##
## $sup
## [1] 59.6
outliers(dataset, column='bill_length_mm')## $inf
## [1] 32.1
##
## $sup
## [1] 59.6
outliers(dataset, column='bill_depth_mm')## $inf
## [1] 13.1
##
## $sup
## [1] 21.5
outliers(dataset, column='flipper_length_mm')## $inf
## [1] 172
##
## $sup
## [1] 231
outliers(dataset, column='body_mass_g')## $inf
## [1] 2700
##
## $sup
## [1] 6300
outliers(dataset, column='year')## $inf
## [1] 2007
##
## $sup
## [1] 2009
bar_plots(dataset)Observaciones
dataset <- dataset %>% drop_na()
missings_summary(dataset)## Warning: attributes are not identical across measure variables;
## they will be dropped
corr_plot(dataset %>% dplyr::select(-year))segmented_pairs_plot(dataset, segment_column='species')train_test <- train_test_split(dataset, train_size = 0.7, shuffle = TRUE)## [1] "Train set size: 233"
## [1] "Test set size: 100"
train_set <- train_test[[1]]
test_set <- train_test[[2]]lineal_model_1 <- lm(
body_mass_g ~ bill_length_mm + bill_depth_mm + flipper_length_mm,
data = train_set
)bayesion_model_1 <- stan(
model_code = "
data {
int<lower=1> obs_count;
vector<lower=1>[obs_count] x1;
vector<lower=1>[obs_count] x2;
vector<lower=1>[obs_count] x3;
vector[obs_count] y;
}
parameters {
real beta0;
real beta1;
real beta2;
real beta3;
real<lower=0> sigma;
}
model {
beta0 ~ normal(0, 8000);
beta1 ~ normal(0, 100);
beta2 ~ normal(0, 100);
beta3 ~ normal(0, 100);
sigma ~ exponential(0.1);
y ~ normal(beta0 + beta1 * x1 + beta2 * x2 + beta3 * x3, sigma);
}
",
data = list(
obs_count = nrow(train_set),
y = colvalues(train_set, 'body_mass_g'),
x1 = colvalues(train_set, 'bill_length_mm'),
x2 = colvalues(train_set, 'bill_depth_mm'),
x3 = colvalues(train_set, 'flipper_length_mm')
),
chains = 3,
iter = 300,
warmup = 180,
thin = 1
)## Running /usr/local/lib/R/bin/R CMD SHLIB foo.c
## gcc -I"/usr/local/lib/R/include" -DNDEBUG -I"/usr/local/lib/R/site-library/Rcpp/include/" -I"/usr/local/lib/R/site-library/RcppEigen/include/" -I"/usr/local/lib/R/site-library/RcppEigen/include/unsupported" -I"/usr/local/lib/R/site-library/BH/include" -I"/usr/local/lib/R/site-library/StanHeaders/include/src/" -I"/usr/local/lib/R/site-library/StanHeaders/include/" -I"/usr/local/lib/R/site-library/RcppParallel/include/" -I"/usr/local/lib/R/site-library/rstan/include" -DEIGEN_NO_DEBUG -DBOOST_DISABLE_ASSERTS -DBOOST_PENDING_INTEGER_LOG2_HPP -DSTAN_THREADS -DBOOST_NO_AUTO_PTR -include '/usr/local/lib/R/site-library/StanHeaders/include/stan/math/prim/mat/fun/Eigen.hpp' -D_REENTRANT -DRCPP_PARALLEL_USE_TBB=1 -I/usr/local/include -fpic -g -O2 -fstack-protector-strong -Wformat -Werror=format-security -Wdate-time -D_FORTIFY_SOURCE=2 -g -c foo.c -o foo.o
## In file included from /usr/local/lib/R/site-library/RcppEigen/include/Eigen/Core:88,
## from /usr/local/lib/R/site-library/RcppEigen/include/Eigen/Dense:1,
## from /usr/local/lib/R/site-library/StanHeaders/include/stan/math/prim/mat/fun/Eigen.hpp:13,
## from <command-line>:
## /usr/local/lib/R/site-library/RcppEigen/include/Eigen/src/Core/util/Macros.h:628:1: error: unknown type name ‘namespace’
## 628 | namespace Eigen {
## | ^~~~~~~~~
## /usr/local/lib/R/site-library/RcppEigen/include/Eigen/src/Core/util/Macros.h:628:17: error: expected ‘=’, ‘,’, ‘;’, ‘asm’ or ‘__attribute__’ before ‘{’ token
## 628 | namespace Eigen {
## | ^
## In file included from /usr/local/lib/R/site-library/RcppEigen/include/Eigen/Dense:1,
## from /usr/local/lib/R/site-library/StanHeaders/include/stan/math/prim/mat/fun/Eigen.hpp:13,
## from <command-line>:
## /usr/local/lib/R/site-library/RcppEigen/include/Eigen/Core:96:10: fatal error: complex: No such file or directory
## 96 | #include <complex>
## | ^~~~~~~~~
## compilation terminated.
## make: *** [/usr/local/lib/R/etc/Makeconf:168: foo.o] Error 1
params_1 <- c('beta0', 'beta1', 'beta2', 'beta3', 'sigma')
traceplot(bayesion_model_1, pars = params_1, inc_warmup = TRUE)lm_vs_br_coeficients(lineal_model_1, bayesion_model_1, params_1)vars_1 <- c('bill_length_mm', 'bill_depth_mm', 'flipper_length_mm')
models_validation(lineal_model_1, bayesion_model_1, params_1, vars_1, test_set)bayesion_predictor_1 <- BayesianRegressionPredictor.from(bayesion_model_1, params_1, vars_1)
plot_compare_fit(
lineal_model_1,
bayesion_predictor_1,
train_set,
label_1='Regresion Lineal',
label_2='Regresion Bayesiana'
)lineal_model_2 <- lm(
body_mass_g
~ bill_length_mm
+ bill_depth_mm
+ flipper_length_mm
+ sex,
data = train_set
)Antes que anda transformamos la columna categórica a un one-hot encoding:
cat_cols <- c('sex')
train_set_cat <- train_set %>% dummify(cat_cols)
test_set_cat <- test_set %>% dummify(cat_cols)
train_set_catConstruimos una matriz con todas las variables x + intercept.
to_model_input <- function(df) {
model.matrix(
body_mass_g
~ bill_length_mm
+ bill_depth_mm
+ flipper_length_mm
+ sex_female
+ sex_male,
data = df
)
}
train_X <- train_set_cat %>% to_model_input()
test_X <- test_set_cat %>% to_model_input()bayesion_model_2 <- stan(
model_code = "
data {
int<lower=1> obs_count;
int<lower=1> coef_count;
matrix[obs_count,coef_count] X;
vector[obs_count] y;
}
parameters {
vector[coef_count] beta;
real<lower=0> sigma;
}
model {
beta[1] ~ normal(0, 2000);
beta[2] ~ normal(0, 30);
beta[3] ~ normal(0, 100);
beta[4] ~ normal(0, 100);
beta[5] ~ normal(0, 100);
beta[6] ~ normal(0, 100);
sigma ~ exponential(0.1);
y ~ normal(X * beta, sigma);
}
",
data = list(
obs_count = dim(train_X)[1],
coef_count = dim(train_X)[2],
y = colvalues(train_set_cat, 'body_mass_g'),
X = train_X
),
chains = 3,
iter = 300,
warmup = 180,
thin = 1
)## Running /usr/local/lib/R/bin/R CMD SHLIB foo.c
## gcc -I"/usr/local/lib/R/include" -DNDEBUG -I"/usr/local/lib/R/site-library/Rcpp/include/" -I"/usr/local/lib/R/site-library/RcppEigen/include/" -I"/usr/local/lib/R/site-library/RcppEigen/include/unsupported" -I"/usr/local/lib/R/site-library/BH/include" -I"/usr/local/lib/R/site-library/StanHeaders/include/src/" -I"/usr/local/lib/R/site-library/StanHeaders/include/" -I"/usr/local/lib/R/site-library/RcppParallel/include/" -I"/usr/local/lib/R/site-library/rstan/include" -DEIGEN_NO_DEBUG -DBOOST_DISABLE_ASSERTS -DBOOST_PENDING_INTEGER_LOG2_HPP -DSTAN_THREADS -DBOOST_NO_AUTO_PTR -include '/usr/local/lib/R/site-library/StanHeaders/include/stan/math/prim/mat/fun/Eigen.hpp' -D_REENTRANT -DRCPP_PARALLEL_USE_TBB=1 -I/usr/local/include -fpic -g -O2 -fstack-protector-strong -Wformat -Werror=format-security -Wdate-time -D_FORTIFY_SOURCE=2 -g -c foo.c -o foo.o
## In file included from /usr/local/lib/R/site-library/RcppEigen/include/Eigen/Core:88,
## from /usr/local/lib/R/site-library/RcppEigen/include/Eigen/Dense:1,
## from /usr/local/lib/R/site-library/StanHeaders/include/stan/math/prim/mat/fun/Eigen.hpp:13,
## from <command-line>:
## /usr/local/lib/R/site-library/RcppEigen/include/Eigen/src/Core/util/Macros.h:628:1: error: unknown type name ‘namespace’
## 628 | namespace Eigen {
## | ^~~~~~~~~
## /usr/local/lib/R/site-library/RcppEigen/include/Eigen/src/Core/util/Macros.h:628:17: error: expected ‘=’, ‘,’, ‘;’, ‘asm’ or ‘__attribute__’ before ‘{’ token
## 628 | namespace Eigen {
## | ^
## In file included from /usr/local/lib/R/site-library/RcppEigen/include/Eigen/Dense:1,
## from /usr/local/lib/R/site-library/StanHeaders/include/stan/math/prim/mat/fun/Eigen.hpp:13,
## from <command-line>:
## /usr/local/lib/R/site-library/RcppEigen/include/Eigen/Core:96:10: fatal error: complex: No such file or directory
## 96 | #include <complex>
## | ^~~~~~~~~
## compilation terminated.
## make: *** [/usr/local/lib/R/etc/Makeconf:168: foo.o] Error 1
params_2 <- c('beta[1]', 'beta[2]', 'beta[3]', 'beta[4]', 'beta[5]', 'beta[6]', 'sigma')
traceplot(bayesion_model_2, inc_warmup = TRUE, pars = params_2)lineal_model_2$coefficients## (Intercept) bill_length_mm bill_depth_mm flipper_length_mm
## -1922.6152066 -0.5330962 -92.9216684 37.2016333
## sexmale
## 534.1583252
br_coeficients(bayesion_model_2, params_2)vars_2 <- c('bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'sex_female','sex_male')
models_validation(
lineal_model_2,
bayesion_model_2,
params_2,
vars_2,
test_set,
test_set_cat
)bayesion_predictor_2 <- BayesianRegressionPredictor.from(bayesion_model_2, params_2, vars_2)
plot_compare_fit(
lineal_model_2,
bayesion_predictor_2,
test_set,
test_set_cat,
label_1='Regresion Lineal',
label_2='Regresion Bayesiana'
)plot_data(train_set)train_set_with_outliers <- train_set %>%
mutate(flipper_length_mm = ifelse(
body_mass_g > 5600 ,
flipper_length_mm + (flipper_length_mm * runif(1, 0.2, 0.3)),
flipper_length_mm
))
plot_data(train_set_with_outliers)lineal_model_3 <- lm(
body_mass_g ~ bill_length_mm + bill_depth_mm + flipper_length_mm,
data = train_set_with_outliers
)plot_compare_fit(
lineal_model_1,
lineal_model_3,
train_set,
label_1='Regresión Lineal SIN outliers',
label_2='Regresión Lineal CON outliters'
)lm_vs_lm_coeficients(lineal_model_1, lineal_model_3) bayesion_model_3 <- stan(
model_code = "
data {
int<lower=1> obs_count;
vector<lower=1>[obs_count] x1;
vector<lower=1>[obs_count] x2;
vector<lower=1>[obs_count] x3;
vector[obs_count] y;
}
parameters {
real beta0;
real beta1;
real beta2;
real beta3;
real<lower=0> sigma;
}
model {
beta0 ~ normal(0, 8000);
beta1 ~ normal(0, 100);
beta2 ~ normal(0, 100);
beta3 ~ normal(0, 100);
sigma ~ exponential(0.1);
y ~ normal(beta0 + beta1 * x1 + beta2 * x2 + beta3 * x3, sigma);
}
",
data = list(
obs_count = nrow(train_set_with_outliers),
y = colvalues(train_set_with_outliers, 'body_mass_g'),
x1 = colvalues(train_set_with_outliers, 'bill_length_mm'),
x2 = colvalues(train_set_with_outliers, 'bill_depth_mm'),
x3 = colvalues(train_set_with_outliers, 'flipper_length_mm')
),
chains = 3,
iter = 300,
warmup = 180,
thin = 1
)
params_3 <- c('beta0', 'beta1', 'beta2', 'beta3', 'sigma')
traceplot(bayesion_model_3, pars = params_3, inc_warmup = TRUE)lm_vs_br_coeficients(lineal_model_3, bayesion_model_3, params_3)vars_3 <- c('bill_length_mm', 'bill_depth_mm', 'flipper_length_mm')
models_validation(lineal_model_3, bayesion_model_3, params_3, vars_3, test_set)bayesion_predictor_3 <- BayesianRegressionPredictor.from(bayesion_model_3, params_3, vars_3)
plot_compare_fit(
lineal_model_1,
bayesion_predictor_3,
train_set,
label_1='Regresion Lineal SIN outliers',
label_2='Regresion Bayesiana CON outliers'
)En este aso entrenamos solo con el 10% de lo datos.
train_test <- train_test_split(dataset, train_size = 0.05, shuffle = TRUE)## [1] "Train set size: 16"
## [1] "Test set size: 317"
train_set_4 <- train_test[[1]]
test_set_4 <- train_test[[2]]plot_data(train_set_4)lineal_model_4 <- lm(
body_mass_g ~ bill_length_mm + bill_depth_mm + flipper_length_mm,
data = train_set_4
)bayesion_model_4 <- stan(
model_code = "
data {
int<lower=1> obs_count;
vector<lower=1>[obs_count] x1;
vector<lower=1>[obs_count] x2;
vector<lower=1>[obs_count] x3;
vector[obs_count] y;
}
parameters {
real beta0;
real beta1;
real beta2;
real beta3;
real<lower=0> sigma;
}
model {
beta0 ~ normal(0, 8000);
beta1 ~ normal(0, 100);
beta2 ~ normal(0, 100);
beta3 ~ normal(0, 100);
sigma ~ exponential(0.1);
y ~ normal(beta0 + beta1 * x1 + beta2 * x2 + beta3 * x3, sigma);
}
",
data = list(
obs_count = nrow(train_set_4),
y = colvalues(train_set_4, 'body_mass_g'),
x1 = colvalues(train_set_4, 'bill_length_mm'),
x2 = colvalues(train_set_4, 'bill_depth_mm'),
x3 = colvalues(train_set_4, 'flipper_length_mm')
),
chains = 3,
iter = 300,
warmup = 180,
thin = 1
)
params_4 <- c('beta0', 'beta1', 'beta2', 'beta3', 'sigma')
traceplot(bayesion_model_4, pars = params_4, inc_warmup = TRUE)Coeficientes de la regresión múltiple:
lineal_model_4$coefficients## (Intercept) bill_length_mm bill_depth_mm flipper_length_mm
## -8926.26327 24.74177 81.05387 52.87000
Coeficientes descubiertos por la regresión múltiple bayesiana:
for(param in params_4) print(get_posterior_mean(bayesion_model_4, par=param)[4])## [1] -7895.822
## [1] 26.30672
## [1] 53.77526
## [1] 49.6672
## [1] 236.508
vars_4 <- c('bill_length_mm', 'bill_depth_mm', 'flipper_length_mm')
models_validation(lineal_model_4, bayesion_model_4, params_4, vars_4, test_set)bayesion_predictor_4 <- BayesianRegressionPredictor.from(bayesion_model_4, params_4, vars_4)
plot_compare_fit(
lineal_model_4,
bayesion_predictor_4,
train_set,
label_1='Regresion Lineal',
label_2='Regresion Bayesiana'
)Definimos una distribución uniforme para el beta asociado a la variable flipper_length_mm.
## Running /usr/local/lib/R/bin/R CMD SHLIB foo.c
## gcc -I"/usr/local/lib/R/include" -DNDEBUG -I"/usr/local/lib/R/site-library/Rcpp/include/" -I"/usr/local/lib/R/site-library/RcppEigen/include/" -I"/usr/local/lib/R/site-library/RcppEigen/include/unsupported" -I"/usr/local/lib/R/site-library/BH/include" -I"/usr/local/lib/R/site-library/StanHeaders/include/src/" -I"/usr/local/lib/R/site-library/StanHeaders/include/" -I"/usr/local/lib/R/site-library/RcppParallel/include/" -I"/usr/local/lib/R/site-library/rstan/include" -DEIGEN_NO_DEBUG -DBOOST_DISABLE_ASSERTS -DBOOST_PENDING_INTEGER_LOG2_HPP -DSTAN_THREADS -DBOOST_NO_AUTO_PTR -include '/usr/local/lib/R/site-library/StanHeaders/include/stan/math/prim/mat/fun/Eigen.hpp' -D_REENTRANT -DRCPP_PARALLEL_USE_TBB=1 -I/usr/local/include -fpic -g -O2 -fstack-protector-strong -Wformat -Werror=format-security -Wdate-time -D_FORTIFY_SOURCE=2 -g -c foo.c -o foo.o
## In file included from /usr/local/lib/R/site-library/RcppEigen/include/Eigen/Core:88,
## from /usr/local/lib/R/site-library/RcppEigen/include/Eigen/Dense:1,
## from /usr/local/lib/R/site-library/StanHeaders/include/stan/math/prim/mat/fun/Eigen.hpp:13,
## from <command-line>:
## /usr/local/lib/R/site-library/RcppEigen/include/Eigen/src/Core/util/Macros.h:628:1: error: unknown type name ‘namespace’
## 628 | namespace Eigen {
## | ^~~~~~~~~
## /usr/local/lib/R/site-library/RcppEigen/include/Eigen/src/Core/util/Macros.h:628:17: error: expected ‘=’, ‘,’, ‘;’, ‘asm’ or ‘__attribute__’ before ‘{’ token
## 628 | namespace Eigen {
## | ^
## In file included from /usr/local/lib/R/site-library/RcppEigen/include/Eigen/Dense:1,
## from /usr/local/lib/R/site-library/StanHeaders/include/stan/math/prim/mat/fun/Eigen.hpp:13,
## from <command-line>:
## /usr/local/lib/R/site-library/RcppEigen/include/Eigen/Core:96:10: fatal error: complex: No such file or directory
## 96 | #include <complex>
## | ^~~~~~~~~
## compilation terminated.
## make: *** [/usr/local/lib/R/etc/Makeconf:168: foo.o] Error 1
br_vs_br_coeficients(bayesion_model_1, bayesion_model_5, params_5)models_validation(lineal_model_1, bayesion_model_1, params_1, vars_1, test_set)vars_5 <- c('bill_length_mm', 'bill_depth_mm', 'flipper_length_mm')
models_validation(lineal_model_1, bayesion_model_5, params_5, vars_5, test_set)bayesion_predictor_5 <- BayesianRegressionPredictor.from(bayesion_model_5, params_5, vars_5)
plot_compare_fit(
bayesion_predictor_1,
bayesion_predictor_5,
train_set,
label_1='Regresion Bayesiana con dist informativa',
label_2='Regresion Bayesiana con dist menos informativa'
)COMPLETAR